package spider.hysrlzy;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import spider.Capturer;

import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;

public class UserDataCapturer implements Capturer {

    // Resume detail pages look like ".../Resumes/<id>.html".
    // NOTE: the '.' before "html" is escaped; the previous pattern matched any character there.
    private static final Pattern RESUME_URL = Pattern.compile(".+Resumes/\\d+\\.html");

    /**
     * Downloads a resume detail page and extracts the user's basic data.
     *
     * @param url page URL; only URLs matching {@code .../Resumes/<id>.html} are processed
     * @return a list of six strings: four basic-info table cells, the photo URL,
     *         and the source URL itself — or an empty list when the URL does not
     *         match the expected pattern or the page cannot be fetched.
     *         Missing elements yield empty strings instead of throwing.
     */
    @Override
    public List<String> capture(String url) {
        System.out.println(url);
        if (!RESUME_URL.matcher(url).matches()) {
            return Collections.emptyList();
        }
        try {
            Document doc = Jsoup.parse(new URL(url), 5000);
            List<String> ret = new ArrayList<>(6);
            // Basic-info table: rows 1-2, columns 2 and 4 hold the values.
            ret.add(textOf(doc, ".div_basic .bas_word table tr:nth-child(1) td:nth-child(2)"));
            ret.add(textOf(doc, ".div_basic .bas_word table tr:nth-child(1) td:nth-child(4)"));
            ret.add(textOf(doc, ".div_basic .bas_word table tr:nth-child(2) td:nth-child(2)"));
            ret.add(textOf(doc, ".div_basic .bas_word table tr:nth-child(2) td:nth-child(4)"));
            // Photo URL; selectFirst returns null when the element is absent.
            Element photo = doc.selectFirst(".div_basic .bas_photo img");
            ret.add(photo == null ? "" : photo.attr("src"));
            ret.add(url);
            return ret;
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Return an empty list on error.
        return Collections.emptyList();
    }

    /**
     * Returns the text of the first element matching {@code cssQuery},
     * or an empty string when no element matches (avoids an NPE on
     * pages whose layout differs from the expected one).
     */
    private static String textOf(Document doc, String cssQuery) {
        Element el = doc.selectFirst(cssQuery);
        return el == null ? "" : el.text();
    }
}
